/*This file takes the data at the subsidy, industry, and location level in order to
create an analysis file. The main variables of interest for the analysis file are 
the differences in the location characteristics in the winning location and the 
location characteristics in the runner-up, or any other alternative, location.
inputs:
outputs:
*/

********************************************************************************
*** prep location-industry-occupation vars, aggregate to CZ
********************************************************************************

u $datadir/naics4_occ_vars, clear
rename countyfip fipscty 

merge m:1 statefip fipscty using "$datadir/cty2cz", keep(1 3) nogen 

collapse (sum) naics4_occ_emp* naics3_occ_emp* (mean) naics3_occ_inc* , by(statefip commutingzoneid2000 naics4 year )


label var naics4_occ_emp_count_top_5 "number of employees in top 5 occupations associated with industry (n4)"
label var naics4_occ_emp_count_top_2 "number of employees in top 2 occupations associated with industry (n4)"

label var naics3_occ_emp_count_top_5 "number of employees in top 5 occupations associated with industry (n3)"
label var naics3_occ_emp_count_top_2 "number of employees in top 2 occupations associated with industry (n3)"

foreach v in naics4 naics3 {
		winsor2 `v'_occ_emp_count_top_2, cut(1 99) replace
		winsor2 `v'_occ_emp_count_top_5, cut(1 99) replace
}

foreach v in 2 5 {
		g incwage_top_`v'=naics3_occ_incwage_top_`v'/1000
		winsor2 incwage_top_`v', cut(1 99) replace
}

recode incwage_top_* (0=.)
drop naics3*incwage* 

label var incwage_top_2 "average wage of top 2 occupations associated with industry (n3)"
label var incwage_top_5 "average wage of top 5 occupations associated with industry (n3)"

drop naics4_occ*

sa $temp/emp_wage, replace

********************************************************************************
*** prep location vars, aggregate to CZ
********************************************************************************
u $datadir/location_char, clear 

*** collapse to cz level ***
rename countyfip fipscty 
merge m:1 statefip fipscty using $datadir/cty2cz, keep(3)nogen keepusing(commutingzoneid2000)

//set of variables to sum over CZ 
foreach v in pop n_univ landarea airport_any airport_large top_RD routemilesoffreightrailroad emp {
	bys statefip commutingzoneid2000 year: egen `v'_cz = sum(`v')
	}
	
//distance to the border, use min and mean:
g min_mindist = mindist 
rename mindist mean_mindist 

// take means for the remainder of the variables 
collapse (min) min_mindist (mean) mean_mindist personal_inc_pc density pct_bac_or_more unemp epop zhvi *_zoning *_roadnetwork ///
			ofpoorcondition ofmediumtofair proptax ///
			[aw=pop], by(stateabbrev statefip commutingzoneid2000 year corprate pinc_maxrate r2w term_limit sales_tax_rate ///
				e_comm e_ind statename pop_cz emp_cz n_univ_cz landarea_cz airport_any_cz airport_large_cz top_RD_cz ///
				routemilesoffreightrailroad_cz)
 
g airport_any=(airport_any_cz>0)
drop airport_any_cz 

// fillin housing price data 
sort stateabbrev commutingzone year 
by stateabbrev commutingzoneid2000: replace zhvi=zhvi[_n-1] if zhvi==. //use past housing prices if missing 
by stateabbrev commutingzoneid2000: replace zhvi=zhvi[_n+1] if zhvi==. //use future housing prices if missing 
bys commutingzoneid2000 year: egen med_zhvi = median(zhvi) 
replace zhvi=med_zhvi if zhvi==. // fill in with other part of CZ housing prices if missing 
drop med_zhvi
bys stateabbrev year: egen med_zhvi = median(zhvi)
replace zhvi=med_zhvi if zhvi==.  // fill in with other part of state housing prices if missing 
drop med_zhvi

rename *_cz *

	rename routemiles freightrail
	rename ofpoor bad_bridges 
	rename ofmedium fair_bridges
	
	drop fair_bridges airport_large 

foreach v in personal_inc_pc density zhvi auto_roadnetwork total_roadnetwork freightrail proptax pct_bac_or_more bad_bridges {
winsor2 `v', cuts(1 99) replace
		}
		
sa $temp/location_cz, replace

********************************************************************************
u $datadir/subsidy_data, clear 
		
///Get characteristics of winning place: 
	rename state stateabbrev
	order id stateabbrev fipscty commutingzoneid2000 county year 
	merge m:1 stateabbrev commutingzoneid2000 year using "$temp/location_cz", keep(1 3) nogen ///
			keepusing(personal_inc_pc density pct_bac unemp landarea *_RD airport_any n_univ pop ///
		freightrail bad_bridges *_mindist *roadnetwork *_zoning proptax ///
		zhvi stateabbrev corprate pinc_maxrate r2w term_limit sales_tax_rate ///
			emp	e_comm e_ind term_limit)

	merge m:1 statefip commutingzoneid2000 naics4 year using "$datadir/cz_naics_rect"	, ///
		keep(3) nogen keepusing(wage ln_wage  perc_est_n4) 
	
	merge m:1 statefip commutingzoneid2000 naics4 year using "$temp/emp_wage", ///
		keep(3) nogen	

	
	foreach v in 2 5 {
		rename naics3_occ_emp_count_top_`v' occ_top`v'_naics3
		}
		
	rename * *_win
	rename naics*_win naics* 
	drop diff_present*
	rename year_win year 	
	rename ru_offer ru_offer
	
///Clean up variables for runner-up place:  
	rename ru_state stateabbrev
	
	statastates, abbreviation(stateabbrev)
	drop if _m == 2
	drop _m state_name
	
//prep for runner-up place merge
	ren state_fips statefip
	ren countyname_ru countyname 
	drop ru_county
	replace countyname = "new york city" if countyname=="new york"
	replace countyname = "newport news city" if countyname=="newport news"
	replace countyname = "charlottesville city" if countyname=="charlottesville"
	replace countyname = subinstr(countyname, " parish", "", .)
	merge m:1 statefip countyname using "$datadir/cty2cz", keep(1 3) nogen
	rename commutingzoneid2000 cz_ru
	rename commutingzoneid2000_win cz_win
	ren countyname county_ru
	rename statefip statefip_ru

//Expand so that we have all possible options for firm to consider, not just runner-ups
	joinby year naics4 using "$datadir/cz_naics_rect", unmatched(master)
	assert _m==3
	drop _m

	
//merge in industry-location chars
merge m:1 statefip commutingzoneid2000 naics4 year using "$temp/emp_wage", ///
		keep(3) nogen	
		
	foreach v in 2 5 {
		rename naics3_occ_emp_count_top_`v' occ_top`v'_naics3
		}
		
//don't need winner-winner pairs
	drop if cz_win == commutingzoneid2000 & statefip_win==statefip /*winning place*/
	
//identify runner-ups 
	g runner_up=(cz_ru == commutingzoneid2000) & (statefip_ru == statefip)
	replace county_ru="" if runner_up==0
	bys id: egen max_ru = max(runner_up)
	replace runner_up=1 if max_ru==0 & (statefip_ru == statefip) /*don't forget runner-ups where we don't know the county*/
	sort id commutingzoneid2000 
	by id commutingzoneid2000: g n=_n
	drop if n>1 & runner_up==0 /* drop duplicates */
	g ru_county=(county_ru!="")
	drop stateabbrev cz_ru max_ru n fips statefip_ru
	
//merge in location chars:
	merge m:1 statefip commutingzoneid2000 year using $temp/location_cz, keep(3) nogen  ///
		keepusing(personal_inc_pc density pct_bac unemp landarea *_RD airport_any n_univ pop ///
		freightrail bad_bridges *_mindist *roadnetwork *_zoning proptax ///
		zhvi stateabbrev corprate pinc_maxrate r2w term_limit sales_tax_rate ///
			emp	e_comm e_ind term_limit)
		
//housekeeping
	replace density=density/1000
	replace density_win=density_win/1000

	label var density "persons (1000) per square mile"
	label var density_win "persons (1000) per square mile"

	label var jobs_direct_win "new jobs promised to winning location"
	label var jobs_total_win "total jobs at stake"
	label var jobs_retain_win "jobs retained by subsidy deal"
	rename jobs_*_win jobs_*
	rename firm_win firm 
	
	rename pct_bac_or_more* pr_college* 
	replace bad_bridges= bad_bridges*100
	replace bad_bridges_w = bad_bridges_w*100
	replace perc_est_n4 = perc_est_n4*100
	replace perc_est_n4_win = perc_est_n4_win*100
		
//Scale freight rail variable, property tax variable:
	replace freightrail=freightrail/landarea
	replace freightrail_win = freightrail_win/landarea_win
	replace proptax = (proptax/1000)/zhvi
	replace proptax_win = (proptax_win/1000)/zhvi_win

//use top5 occ wage if industry wage missing
drop incwage_top_2*
	replace wage= incwage_top_5 if wage==. 
	replace wage_win = incwage_top_5_win if wage_win==.

//share of employment in relevant occupations 
g sh_occ = occ_top2_naics3/emp 
	g sh_occ_win = occ_top2_naics3_win/emp_win 

//subsidy vars 
	g jobs_1000=jobs_total/1000
	rename sub_M sub_M
	g ln_sub=log(sub_M) 	
	g ln_invest=log(invest_M+1)
	rename invest_M invest_M
	rename threat_present present_ru
	replace present_ru=. if runner_up==0
	rename win_present present_win 
	g mult_jobs = mult_tot*jobs_1000
	g invest_B = invest_M/1000
	replace jobs_direct=jobs_direct/1000
	replace jobs_retain = jobs_retain/1000
	
//personal income 
	adjust_inflation personal_inc_pc*, year(2017)
	
	g ln_pinc = log(personal_inc_pc/1000)
	g ln_pinc_win = log(personal_inc_pc_win/1000)

	
//industry types
	sort id stateabbrev 
	g type=1 if inlist(naics4, 2111, 2211, 3241, 3251, 3252, 3253, 3254, 3259) /*high-tech manuf*/
	replace type=1 if naics3==334  /*high-tech manuf*/
	replace type=3 if inlist(naics4, 5112, 5173, 5511) | naics3==541 | naics3==523 /*high-tech services*/
	replace type=4 if naics4>4000 & type==.  /* services */
	replace type=2 if type==. /*manuf*/

g manuf=(naics2==31 | type==1 | type==2)

/*two types of manufacturing, traditional and non*/
g manuf_type = 1 if inlist(naics4,3254, 3341, 3342, 3344, 3345, 3364)
replace manuf_type = 3 if manuf_type==. & manuf==1
*Services: trade vs services
g trade = inrange(naics3,400,500)

g low_serv = (trade==1 | type==4) 
replace low_serv=. if manuf==1 


// create difference variables for profit estimation
	foreach v in pinc_maxrate sales_tax_rate corprate r2w personal_inc_pc wage perc_est_n4 ///
		density  pr_college bad_bridges ///
		zhvi ln_wage n_univ airport_any top_RD  e_comm e_ind auto_roadnetwork ///
		total_roadnetwork freightrail proptax indus_zoning comm_zoning ///
		 occ_top2_naics3 occ_top5_naics3 incwage_top_5 sh_occ {
		g diff_`v' = `v' - `v'_win 
		}
		
	rename *pinc_maxrate* *income_tax*
	rename *corprate* *corp_tax*
	rename *sales_tax_rate* *sales_tax*
	
*tag runner-ups 
	gsort id stateabbrev runner_up -est_n4
	by id stateabbrev runner_up: g n=_n
	g ru_limit=(n==1) /*one runner_up for each state, when we don't know the county */
	replace ru_limit=0 if runner_up==0
	drop n 
	bys id ru_limit: gen weight=1/_N
	replace weight=. if runner_up==0
	drop runner_up 
	rename commutingzoneid2000 cz

	rename *zhvi* *FHFA* 

	adjust_inflation FHFA* diff_FHFA e_ind* diff_e_ind wage* diff_wage, year(2017)
	
drop total_roadnetwork*
rename n_bid_win n_bid

foreach v in diff_pr_college pr_college pr_college_win diff_proptax proptax proptax_win diff_sh_occ {
	replace `v'=`v'*100
	}
	
foreach v in diff_occ_top2_naics3 occ_top2_naics3 occ_top2_naics3_win {
		replace `v'=`v'/1000
}
	
foreach v in diff_indus_zoning indus_zoning indus_zoning_win {
		replace `v'=`v'/6
}

rename mult_*_win mult_*
//recoding to mean
recode diff_indus_zoning (.=0)
recode indus_zoning indus_zoning_win (.=0.5)
	
drop state_name_win countyname_win unique_id_win 

//labeling and organizing: 
order *, alphabetic
	order id_win year firm sub_M stateabbrev_win statefip_win cz_win county_win fipscty_win stateabbrev statefip cz county_ru ru_county fipscty ru_limit n_bid orig_sub text_n ru_offer jobs* invest* naics* mult*  ln_sub ln_invest manuf manuf_type trade low_serv type *_win
	order diff*, last 

	winsor2 invest_B, cut(1 99) replace 
	
	label var ln_sub "log(subsidy)"
	label var ln_invest "log(investment promised)"
label var id_win "ID for subsidy deal"
label var stateabbrev_win "subsidy deal state"
label var statefip_win "subsidy deal state FIPS"
label var cz_win "subsidy deal CZ"
label var county_win "subsidy deal county"
label var fipscty_win "subsidy deal county FIPS"
label var stateabbrev "state"
label var statefip "state FIPS"
label var cz "CZ"
label var county_ru "county name if runner-up"
label var ru_county "dummy for whether runner-up county is known from data"
replace ru_county=. if ru_limit==0
drop fipscty 
label var jobs_1000 "total jobs (1,000)"
label var mult_supplier "jobs multiplier via supplier"
label var mult_induced "jobs multiplier via induced demand"
label var mult_total "total multiplier"
label var mult_jobs "multiplier x jobs promised"
label var invest_B "investment planned ($ B), winsorized at 99%"
label var naics4 "4-digit NAICS code"
label var naics3 "3-digit NAICS code"
label var naics2 "2-digit NAICS code"
label var ru_limit "runner-up dummy" 
label var manuf "manufacturing dummy"
label var manuf_type "1 = High-tech manufacturing, 3= traditional manufacturing"
label var trade "trade dummy"
label var low_serv "low-skilled services dummy"
label var type "1=High-tech manuf, 2=high-skill serv, 3=trad. manuf, 4=low-skill services"
rename orig_sub_M* orig_sub_M 
rename text_n_bid* text_n_bid 
label var pop_win "population of CZ"
label var landarea_win "land area in CZ"
label var n_univ_win "number of universities"
label var top_RD_win "top R&D universities"
label var emp_win "total employment in CZ"
label var freightrail_win "route miles of freight rail"
label var personal_inc_pc_win "personal income per capita" 
label var min_mindist_win "minimum distance to border"
label var mean_mindist_win "mean of min. distance to border"
label var pop "population of CZ"
label var landarea "land area in CZ"
label var n_univ "number of universities"
label var top_RD "top R&D universities"
label var emp "total employment in CZ"
label var freightrail "route miles of freight rail"
label var personal_inc_pc "personal income per capita" 
label var min_mindist "minimum distance to border"
label var mean_mindist "mean of min. distance to border"
label var weight "weight to runner-up, in case of multiple runnerups"
label var FHFA_win "average housing price ($ 1000)"
label var airport_any_win  "airport dummy"
label var auto_roadnetwork_win  "auto roadnetwork density"
label var FHFA "average housing price ($ 1000)"
label var airport_any  "airport dummy"
label var auto_roadnetwork  "auto roadnetwork density"
label var bad_bridges_win "% of bridges in poor condition"
label var comm_zoning_win "commercial zoning index"
label var indus_zoning_win "industrial zoning index"
label var ln_pinc_win "log(per capita income)"
label var ln_wage_win "log(average local wage, $1,000)"
drop wage_county_win wage_cz_win 
label var wage_win "average local wage ($1,000)"
label var bad_bridges "% of bridges in poor condition"
label var comm_zoning "commercial zoning index"
label var indus_zoning "industrial zoning index"
label var ln_pinc "log(per capita income)"
label var ln_wage "log(average local wage, $1,000)"
label var wage "average local wage ($1,000)"
label var sh_occ_win "population in relevant occupations (%)"
label var proptax_win "property tax payment divided by housing price"
label var pr_college_win "% of pop. with college degree"
label var present_win "firm is present in subsidized locale"
label var unemp_win "unemployment rate (%)"
label var sh_occ "population in relevant occupations (%)"
label var proptax "property tax payment divided by housing price"
label var pr_college "% of pop. with college degree"
label var present_ru  "firm is present in runner-up locale"
label var unemp "unemployment rate (%)"

sa "$datadir/analysis_cz", replace
	
//now also reshape to long, for ease of making some figures
*********************************************************************************
//deal specific variables 
u "$datadir/analysis_cz", clear
	rename id_win id_deal 
	*deal specific vars: 
	keep id_deal year n_bid jobs_direct sub_M firm invest_B mult_jobs naics4 jobs_retain jobs_total trade manuf_type low_serv manuf mult_tot
	duplicates drop 
	sa $temp/dealvar, replace 
	
	
//charactersitics for winning place 	
u "$datadir/analysis_cz", clear
	rename id_win id_deal 
	drop diff* weight naics2 ln_invest manuf manuf_type trade low_serv county* fipscty* ru_offer mult* present*
	keep id_deal *_win 
	duplicates drop 
	rename *_win *
	g winner=1 
	sa $temp/winvar, replace
	
//characteristics for everyone else 
	u "$datadir/analysis_cz", clear
	rename id_win id_deal 
	drop diff* weight naics2 ru_county ln_invest manuf manuf_type trade low_serv *_win  text_n_bid  year n_bid jobs_direct sub_M firm invest_B mult_jobs invest_M naics4 jobs_retain jobs_total county_ru  ru_offer mult* est_n4 present*
	g runnerup=(ru_limit==1)
	drop ru_limit
	duplicates drop 
	duplicates tag id_deal stateabbrev statefip cz, gen(dup)
	drop if dup==1 & runnerup==0
	drop dup
	append using $temp/winvar 
	
	merge m:1 id_deal using $temp/dealvar , nogen 
	
	recode runnerup winner (.=0)
	
	order id_deal firm year sub_M n_bid jobs* mult_jobs  mult_tot invest_B naics4 winner runnerup stateabbrev statefip cz 
	
	g event=(runnerup==1 | winner==1)
	bys id_deal: egen n_event = sum(event)

	replace n_bid=30 if n_bid>30
	
	drop if n_event==n_bid & event==0
	g n_sim_shortlist=n_bid - n_event 
	drop n_event event 
	
	replace jobs_1000 = jobs_total/1000 
	
	label var winner "subsidy deal winner"
	label var runnerup "subsidy deal runnerup"
	label var n_sim_shortlist "number of locations to simulate for shortlist"
	sa $datadir/analysis_cz_long, replace 

	

	